In [2]:
%%HTML
<style type="text/css">
/* Draw solid black borders on pandas DataFrame cells and headers.
   Fixed typo: second selector was "table.datafreame th", which silently
   left header cells unstyled. Also close the style tag. */
table.dataframe td, table.dataframe th {
    border: 1px black solid;
    color: black;
}
</style>
tweet_id negativereason airline name tweet_location retweet_count text
0 570306133677760000 NaN Virgin America cairdin NaN 0 @VirginAmerica What @dhepburn said.
1 570301130888121984 NaN Virgin America jnardino NaN 0 @VirginAmerica plus you've added commercials t...
2 570301083672812992 NaN Virgin America yvonnalynn Lets Play 0 @VirginAmerica I didn't today... Must mean I n...
3 570301031407624000 Bad Flight Virgin America jnardino NaN 0 @VirginAmerica it's really aggressive to blast...
4 570300817074462016 Can't Tell Virgin America jnardino NaN 0 @VirginAmerica and it's a really big bad thing...
In [6]:
df.shape
Out[6]:
(14640, 7)
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 7 columns):
tweet_id          14640 non-null int64
negativereason    9178 non-null object
airline           14640 non-null object
name              14640 non-null object
tweet_location    9907 non-null object
retweet_count     14640 non-null int64
text              14640 non-null object
dtypes: int64(2), object(5)
memory usage: 800.8+ KB
In [8]:
df.isnull().sum()
Out[8]:
tweet_id             0
negativereason    5462
airline              0
name                 0
tweet_location    4733
retweet_count        0
text                 0
dtype: int64
In [9]:
df.drop(columns = ['tweet_id', 'name'], axis = 1, inplace = True)
In [10]:
df.columns
Out[10]:
Index(['negativereason', 'airline', 'tweet_location', 'retweet_count', 'text'], dtype='object')

Finding the sentiment for each tweet

In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
In [12]:
sid = SentimentIntensityAnalyzer()
In [13]:
sid.polarity_scores(df.text[0])
Out[13]:
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
In [14]:
df['polarity_score'] = df.text.apply(lambda x: sid.polarity_scores(x))
In [15]:
df.head()
Out[15]:
negativereason airline tweet_location retweet_count text polarity_score
0 NaN Virgin America NaN 0 @VirginAmerica What @dhepburn said. {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
1 NaN Virgin America NaN 0 @VirginAmerica plus you've added commercials t... {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
2 NaN Virgin America Lets Play 0 @VirginAmerica I didn't today... Must mean I n... {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
3 Bad Flight Virgin America NaN 0 @VirginAmerica it's really aggressive to blast... {'neg': 0.246, 'neu': 0.754, 'pos': 0.0, 'comp...
4 Can't Tell Virgin America NaN 0 @VirginAmerica and it's a really big bad thing... {'neg': 0.321, 'neu': 0.679, 'pos': 0.0, 'comp...
In [16]:
df['score'] = df.polarity_score.apply(lambda x: x['compound'])
In [17]:
def find_sentiment(x):
    """Map a VADER compound score to a sentiment label.

    Returns "Positive" for scores > 0, "Negative" for scores < 0,
    and "Neutral" for exactly 0.
    """
    if x == 0:
        return "Neutral"
    return "Positive" if x > 0 else "Negative"
In [18]:
df["sentiment"] = df.score.apply(lambda x: find_sentiment(x))
In [19]:
df.head()
Out[19]:
negativereason airline tweet_location retweet_count text polarity_score score sentiment
0 NaN Virgin America NaN 0 @VirginAmerica What @dhepburn said. {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 Neutral
1 NaN Virgin America NaN 0 @VirginAmerica plus you've added commercials t... {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 Neutral
2 NaN Virgin America Lets Play 0 @VirginAmerica I didn't today... Must mean I n... {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 Neutral
3 Bad Flight Virgin America NaN 0 @VirginAmerica it's really aggressive to blast... {'neg': 0.246, 'neu': 0.754, 'pos': 0.0, 'comp... -0.5984 Negative
4 Can't Tell Virgin America NaN 0 @VirginAmerica and it's a really big bad thing... {'neg': 0.321, 'neu': 0.679, 'pos': 0.0, 'comp... -0.5829 Negative

Now we can drop the polarity_score and score columns

In [20]:
df.drop(columns = ['polarity_score', 'score'], axis = 1, inplace = True)
In [21]:
df.head()
Out[21]:
negativereason airline tweet_location retweet_count text sentiment
0 NaN Virgin America NaN 0 @VirginAmerica What @dhepburn said. Neutral
1 NaN Virgin America NaN 0 @VirginAmerica plus you've added commercials t... Neutral
2 NaN Virgin America Lets Play 0 @VirginAmerica I didn't today... Must mean I n... Neutral
3 Bad Flight Virgin America NaN 0 @VirginAmerica it's really aggressive to blast... Negative
4 Can't Tell Virgin America NaN 0 @VirginAmerica and it's a really big bad thing... Negative
In [22]:
df.sentiment.value_counts()
Out[22]:
Positive    6222
Negative    5153
Neutral     3265
Name: sentiment, dtype: int64
In [23]:
sns.countplot(x = 'sentiment', data = df)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ffacb455310>
In [24]:
fig = go.Figure()

c = ['green','red','blue']

fig.add_trace(go.Bar(x = df.sentiment.value_counts().index,
                     y = df.sentiment.value_counts(),
                     marker_color = c,
                     text = df.sentiment.value_counts(),
                     textposition='auto'))

fig.update_layout(title = dict(text = "Sentiment count"))

Topic Modeling of the tweets

In [25]:
import spacy
In [26]:
nlp = spacy.load('en_core_web_sm')
In [27]:
def tokenization(s):
    """Lowercase the text, run it through the spaCy pipeline, and
    return the lemma of every token as a list of strings."""
    doc = nlp(s.lower())
    return [tok.lemma_ for tok in doc]
In [28]:
df["lemma"] = df.text.apply(lambda x: tokenization(x))
In [29]:
df.head()
Out[29]:
negativereason airline tweet_location retweet_count text sentiment lemma
0 NaN Virgin America NaN 0 @VirginAmerica What @dhepburn said. Neutral [@virginamerica, what, @dhepburn, say, .]
1 NaN Virgin America NaN 0 @VirginAmerica plus you've added commercials t... Neutral [@virginamerica, plus, -PRON-, have, add, comm...
2 NaN Virgin America Lets Play 0 @VirginAmerica I didn't today... Must mean I n... Neutral [@virginamerica, i, do, not, today, ..., must,...
3 Bad Flight Virgin America NaN 0 @VirginAmerica it's really aggressive to blast... Negative [@virginamerica, -PRON-, be, really, aggressiv...
4 Can't Tell Virgin America NaN 0 @VirginAmerica and it's a really big bad thing... Negative [@virginamerica, and, -PRON-, be, a, really, b...
In [30]:
def stopwords(s):
    """Filter spaCy's default stop words out of a token list and
    rejoin the remaining tokens into a single space-separated string."""
    kept = [tok for tok in s if tok not in nlp.Defaults.stop_words]
    return " ".join(kept)
In [31]:
df['proccesed_text'] = df.lemma.apply(lambda x: stopwords(x))
In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [33]:
tdf = TfidfVectorizer(max_df=0.9, min_df=10)
In [34]:
dtm = tdf.fit_transform(df["proccesed_text"])
In [35]:
from sklearn.decomposition import LatentDirichletAllocation
In [36]:
lda = LatentDirichletAllocation(n_components= 10, random_state= 0)
In [37]:
lda.fit(dtm)
Out[37]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)
In [38]:
# Hoist the vocabulary lookup out of the loop: get_feature_names() builds a
# fresh list on every call, and the original invoked it once per *index*
# (10 topics x 20 words = 200 rebuilds of the full vocabulary).
feature_names = tdf.get_feature_names()

for i, topic in enumerate(lda.components_):
    print(f"Top 20 words for topic {i}")
    # argsort is ascending, so the last 20 indices are the highest-weight words.
    print([feature_names[index] for index in topic.argsort()[-20:]])
    print("\n")
Top 20 words for topic 0
['thank', 'flightr', 'jetblue', 'hold', 'dfw', 'need', 'unite', 'flightled', 'southwestair', 'late', 'tomorrow', 'delay', 'help', 'hour', 'usairway', 'flightle', 'americanair', 'cancel', 'flight', 'pron']


Top 20 words for topic 1
['update', 'late', 'time', 'leave', 'min', 'jetblue', 'connection', 'miss', 'americanair', 'sit', 'wait', 'gate', 'usairway', 'hour', 'unite', 'united', 'delay', 'plane', 'flight', 'pron']


Top 20 words for topic 2
['phone', 'need', 'service', 'southwestair', 'agent', 'time', 'luggage', 'lose', 'help', 'united', 'unite', 'minute', 'wait', 'bag', 'hour', 'flight', 'americanair', 'usairway', 'hold', 'pron']


Top 20 words for topic 3
['co', 'airline', 'continue', 'airway', 'win', 'policy', 'lie', 'fail', 'snow', 'americanair', 'united', 'boarding', 'jetblue', 'usairway', 'pass', 'suck', 'flight', 'unite', 'southwestair', 'pron']


Top 20 words for topic 4
['look', 'far', 'time', 'phone', 'online', 'fee', 'united', 'bag', 'check', 'change', 'seat', 'southwestair', 'americanair', 'jetblue', 'booking', 'usairway', 'unite', 'problem', 'flight', 'pron']


Top 20 words for topic 5
['ok', 'reply', 'thanks', 'great', 'respond', 'help', 'email', 'appreciate', 'usairway', 'united', 'response', 'americanair', 'jetblue', 'follow', 'send', 'unite', 'dm', 'southwestair', 'pron', 'thank']


Top 20 words for topic 6
['credit', 'amazing', 'tell', 'thank', 'money', 'experience', 'united', 'know', 'fly', 'terrible', 'care', 'flight', 'usairway', 'unite', 'americanair', 'southwestair', 'jetblue', 'service', 'customer', 'pron']


Top 20 words for topic 7
['sunday', 'usairway', 'new', 'thank', 'look', 'love', 'united', 'americanair', 'good', 'flight', 'southwestair', 'rt', 'fleet', 'fleek', 'virginamerica', 'pron', 'ðÿ', 'jetblue', 'http', 'co']


Top 20 words for topic 8
['crew', 'lax', 'soon', 'love', 'southwest', 'oh', 'wifi', 'great', 'usairway', 'united', 'guy', 'thank', 'americanair', 'fly', 'yes', 'flight', 'unite', 'jetblue', 'southwestair', 'pron']


Top 20 words for topic 9
['website', 'book', 'phone', 'southwestair', 'change', 'mile', 'day', 'reservation', 'united', 'hang', 'flight', 'unite', 'customer', 'service', 'airline', 'try', 'bad', 'americanair', 'usairway', 'pron']


In [39]:
topic = lda.transform(dtm)
In [40]:
df["topic"] = topic.argmax(axis = 1)
In [41]:
df.head()
Out[41]:
negativereason airline tweet_location retweet_count text sentiment lemma proccesed_text topic
0 NaN Virgin America NaN 0 @VirginAmerica What @dhepburn said. Neutral [@virginamerica, what, @dhepburn, say, .] @virginamerica @dhepburn . 7
1 NaN Virgin America NaN 0 @VirginAmerica plus you've added commercials t... Neutral [@virginamerica, plus, -PRON-, have, add, comm... @virginamerica plus -PRON- add commercial expe... 3
2 NaN Virgin America Lets Play 0 @VirginAmerica I didn't today... Must mean I n... Neutral [@virginamerica, i, do, not, today, ..., must,... @virginamerica today ... mean need trip ! 9
3 Bad Flight Virgin America NaN 0 @VirginAmerica it's really aggressive to blast... Negative [@virginamerica, -PRON-, be, really, aggressiv... @virginamerica -PRON- aggressive blast obnoxio... 3
4 Can't Tell Virgin America NaN 0 @VirginAmerica and it's a really big bad thing... Negative [@virginamerica, and, -PRON-, be, a, really, b... @virginamerica -PRON- big bad thing -PRON- 8
In [42]:
df.drop(columns=['lemma', 'proccesed_text'], axis =1, inplace = True)
In [43]:
df.head()
Out[43]:
negativereason airline tweet_location retweet_count text sentiment topic
0 NaN Virgin America NaN 0 @VirginAmerica What @dhepburn said. Neutral 7
1 NaN Virgin America NaN 0 @VirginAmerica plus you've added commercials t... Neutral 3
2 NaN Virgin America Lets Play 0 @VirginAmerica I didn't today... Must mean I n... Neutral 9
3 Bad Flight Virgin America NaN 0 @VirginAmerica it's really aggressive to blast... Negative 3
4 Can't Tell Virgin America NaN 0 @VirginAmerica and it's a really big bad thing... Negative 8
In [44]:
reason = df.negativereason.value_counts()
In [45]:
fig = go.Figure()

fig.add_trace(go.Bar(x = reason.index,
                     y = reason))
In [46]:
df.retweet_count.unique()
Out[46]:
array([ 0,  1,  2,  3,  4,  5,  7, 22,  6, 18, 15, 31, 11,  8,  9, 28, 32,
       44])

Word cloud

In [47]:
df_neg = df[df['sentiment'] == 'Negative']
df_neg.head()  
Out[47]:
negativereason airline tweet_location retweet_count text sentiment topic
3 Bad Flight Virgin America NaN 0 @VirginAmerica it's really aggressive to blast... Negative 3
4 Can't Tell Virgin America NaN 0 @VirginAmerica and it's a really big bad thing... Negative 8
5 Can't Tell Virgin America NaN 0 @VirginAmerica seriously would pay $30 a fligh... Negative 8
8 NaN Virgin America San Diego 0 @virginamerica Well, I didn't…but NOW I DO! :-D Negative 7
10 NaN Virgin America 1/1 loner squad 0 @VirginAmerica did you know that suicide is th... Negative 6
In [48]:
df_neg.to_csv (r'negativeTweets.csv', index = False, header=True)
In [49]:
import collections
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib import rcParams
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk import word_tokenize
In [50]:
def removeAirline(userTyped):
    """Strip a leading "@airline" mention from a tweet, if present.

    word_tokenize splits "@VirginAmerica ..." into ['@', 'VirginAmerica', ...],
    so when the first token is '@' we drop it together with the handle that
    follows and detokenize the rest. Otherwise the text is returned unchanged.

    Fix: the original indexed tokens[0] and popped twice unconditionally,
    which raised IndexError on empty or single-token input.
    """
    tokens = word_tokenize(userTyped)
    if len(tokens) >= 2 and tokens[0] == '@':
        return TreebankWordDetokenizer().detokenize(tokens[2:])
    return userTyped
In [51]:
def wordCloud_res(reason):
    """Render a word cloud of the negative tweets matching `reason`.

    Filters df_neg on the negativereason column, strips leading airline
    mentions, and plots a WordCloud of the combined text.
    """
    df_res = df_neg[df_neg['negativereason'] == reason]

    # Join with a space separator: the original used `+=` with no separator,
    # fusing the last word of each tweet onto the first word of the next.
    all_headlines = ' '.join(
        removeAirline(line) for line in df_res['text'].str.lower()
    )

    # Copy STOPWORDS rather than mutating the shared module-level set, and
    # avoid shadowing the stopwords() helper defined earlier in the notebook.
    cloud_stopwords = set(STOPWORDS)
    cloud_stopwords.add('will')

    wordcloud = WordCloud(stopwords=cloud_stopwords,
                          background_color="white",
                          max_words=500).generate(all_headlines)

    print('With the reason of: ' + reason)
    rcParams['figure.figsize'] = 10, 20
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
In [52]:
reason = df_neg['negativereason'].unique()
reason
Out[52]:
array(['Bad Flight', "Can't Tell", nan, 'Late Flight',
       'Customer Service Issue', 'Flight Booking Problems',
       'Lost Luggage', 'Flight Attendant Complaints', 'Cancelled Flight',
       'Damaged Luggage', 'longlines'], dtype=object)

The most common reasons for negative sentiment are:

'Bad Flight', 'Late Flight','Customer Service Issue', 'Flight Booking Problems','Lost Luggage', 'Flight Attendant Complaints', 'Cancelled Flight', 'Damaged Luggage', 'longlines'

These will be the key words for the robot to analyze

In [53]:
wordCloud_res(reason[0])
With the reason of: Bad Flight
In [54]:
wordCloud_res(reason[3])
With the reason of: Late Flight
In [55]:
wordCloud_res(reason[4])
With the reason of: Customer Service Issue
In [56]:
wordCloud_res(reason[5])
With the reason of: Flight Booking Problems
In [57]:
wordCloud_res(reason[6])
With the reason of: Lost Luggage
In [58]:
wordCloud_res(reason[7])
With the reason of: Flight Attendant Complaints
In [59]:
wordCloud_res(reason[8])
With the reason of: Cancelled Flight
In [60]:
wordCloud_res(reason[9])
With the reason of: Damaged Luggage
In [61]:
wordCloud_res(reason[10])
With the reason of: longlines
In [ ]: